################################################################################################
# Ambulance Project: Main Regression File
# John Graves
# Created 25 May, 2016
# Final Version: 23 April 2018 (RESTAT Accepted Article)
################################################################################################

# Note: to install packages, use the following
#install.packages('package_name', dependencies=TRUE, repos='http://cran.us.r-project.org')

####
##
# Clearout memory and load necessary packages

rm(list=ls())
# Initial RESTAT
dataver = "4-1"
version = "4-2"

# RESTAT RESUBMISSION
dataver = "5-0"
version = "5-0"

# Nondiscretionary condition cohort
cohort = "ndscv220"

options("scipen"=15, "digits"=4)
pkg = list("haven","foreign","ggplot2","plyr","ggthemes","stringr","Hmisc","AER","dplyr","lazyeval","stargazer","clusterSEs")
invisible(lapply(pkg, require, character.only = TRUE))
homedir = "~/hospital-quality/analysis"
setwd(homedir); list.files()

# Condition-specific samples
if (cohort == "cms20")
  {
    load(file=paste0("../data/analytic-file-",cohort,"ami--ver-",dataver,".Rdata"))
    AA.ami = AA
    load(file=paste0("../data/analytic-file-",cohort,"pn--ver-",dataver,".Rdata"))
    AA.pn = AA
    load(file=paste0("../data/analytic-file-",cohort,"hf--ver-",dataver,".Rdata"))
    AA.hf = AA
  }

# Sensitivity Analysis: Files with No AMI, HF or PN patients
load(paste0("../data/analytic-file-orig",cohort,"no-ami-pn-hf-ver-",version,".Rdata"))
AA.Orig.nonCMS <- AA.Orig
load(paste0("../data/analytic-file-",cohort,"no-ami-pn-hf-ver-",dataver,".Rdata"))
AA.nonCMS = AAdm
load(paste0("../data/analytic-file-",cohort,"no-ami-pn-hf-ver-",version,"-1year.Rdata"))
AA1y.nonCMS = AAdm1y %>% select(-year13)

# Main File
load(file=paste0("../data/analytic-file-orig",cohort,"-ver-",dataver,".Rdata"))
# 30-Day outcomes
load(file=paste0("../data/analytic-file-",cohort,"-ver-",dataver,".Rdata"))
AA <- AAdm
# 1 Year Outcomes
load(file=paste0("../data/analytic-file-",cohort,"-ver-",dataver,"-1year.Rdata"))
AA1y <- AAdm1y %>% select(-year13)

set.seed(123)

####
##
# Covariate Sets

#if (cohort == "cms20") XX.Yr = paste0("year",7:12)
# Year
XX.Yr = names(AA)[grep("^year",names(AA))]
# Demographics
XX.D = c("ag70t74", "ag75t79", "ag80t84", "ag85t89", "ag90t94", "ag95p", "male", "black", "race_other")
# Ambulance
XX.A = c("amb_miles","amb_als","amb_emergency","amb_iv","amb_intubate","amb_op","amb_pmt")  #JAG Added AMB_PMT 3/9/18 to include in both results and balance.
# Comorbidities
XX.C = names(AA)[grep("como_",names(AA))]
# Principal Diagnosis
XX.Dx = names(AA)[grep("^dx",names(AA))][-1]

# All 
XX = c(XX.Yr,XX.D,XX.A,XX.C,XX.Dx); XX

# Covariates Sets
cv1 = c(XX.Yr,XX.Dx,XX.D,XX.A)
cv2 = c(cv1,XX.C)

xx = "Hmort30"
yy = "death30"
y = "death30"
cv = cv2
XX = AA
XX1y = AA1y

# Get missingness on covariates
cv = cv2
for (Var in cv) {
  missing <- sum(is.na(AA[,Var]))
  if (missing > 0) {
    print(c(Var,missing))
  }
}

# Function to fit the regressions to the ambulance data
# xx = the quality measure
# yy = the outcome
# cv = additional rhs variables
# XX = the input data frame (30-day outcomes)
# XX1 = the input data frame (1-year outcomes)
# clustervar = The clustering variable (i.e., HSA)

fitamb = function(xx,yy,cv=cv4,XX = AA,XX1y=AA1y,clustervar="hsanum")
  {

    # Covariate Sets
    fs = iv = ols = sec = sec.ols = nn = list()
    zz = paste0(xx,"_jkf.a")

    print(paste0("++++++++++++",xx,"++++++++++++++"))

    XX.Orig = XX
    
    for (y in yy)
      {
        if (y=="death365")
          {
            XX <- XX1y
            cvyrs <- cv[grep("year",cv)]
            if (length(cvyrs)>0)
              {
                cv2 <- cv[-which(cv==cvyrs[length(cvyrs)])]
                cv <- cv2
              } 
          }
        print(paste0("***********",y,"**********"))
        
        fmla.fs = as.formula(paste0(xx," ~ ",zz," + ", paste(cv,collapse="+")))        
        fmla.ols = as.formula(paste0(y," ~ ",xx,"+ ", paste(cv,collapse="+")))
        
        fmla.iv = as.formula(paste0(y," ~ ",xx,"+",paste(cv,collapse="+")," | " , xx,"_jkf.a + ", paste(cv,collapse="+")))
        fmla.rf = as.formula(paste0(y," ~ ",xx,"_jkf.a + ", paste(cv,collapse="+")))
        fiv = ivreg(fmla.iv,data=XX)
        fols = lm(fmla.ols,data=XX)
        
        # Cluster Standard Errors
        library(sandwich)
        library(lmtest)

        source("./sub-files/cluster-se.r")
        
        XX2 <- XX %>% data.frame()
        sec[[y]] <- coeftest.cluster(XX2,fiv, cluster1=clustervar)
        sec.ols[[y]] <- coeftest.cluster(XX2,fols, cluster1=clustervar)
        
        if (!grepl("_orig",xx)) {
          mf = as.data.frame(expand.model.frame(fiv,as.formula(paste0("~",xx,"_orig"))))
          bary = mean(model.frame(mf)[,paste0(xx,"_orig")]); bary
          sdy = sd(model.frame(mf)[,paste0(xx,"_orig")]); sdy        
          
        } else {
          mf = as.data.frame(expand.model.frame(fiv,as.formula(paste0("~",xx))))
          bary = mean(model.frame(mf)[,paste0(xx)]); bary
          sdy = sd(model.frame(mf)[,paste0(xx)]); sdy        
        }
        
        iv[[y]] = as.data.frame(summary(fiv)$coefficients)
        XX2 = model.frame(fiv)

        nn[[y]] = nrow(XX2)
        
        ffs = lm(fmla.fs,data=XX2)        
        fs[[y]] = as.data.frame(summary(ffs)$coefficients)
        print(paste0(y,": First Stage"))
        print(round(fs[[y]][2,],3))
        
        fols = lm(fmla.ols,data=XX2)
        XX3 <- XX2 %>% data.frame()
        
        ols[[y]] = as.data.frame(summary(fols)$coefficients)
        print(paste0(y,": OLS"))
        print(round(ols[[y]][2,],3))
        print(paste0(y,": 2SLS"))        
        print(round(iv[[y]][2,],3))
        print("")
        print("")
        print(paste0("n = ",nrow(XX2)))
        XX <- XX.Orig
      }
    out = list(fs = fs , ols = ols , iv = iv , nn , stats = c(bary,sdy) ,cv = cv, ses = sec, ses.ols = sec.ols)
    return(out)
  }


# Function to test for balance across the instrument (similar specification as the reduced form for the main regressions --
# this function just speeds up estimation since we don't need to fit 2SLS for this.

# xx = the rhs varible (i.e., the intstrument in this case) 
# yy = the outcome
# XX = the data frame
# clustervar = the clustering variable

fitbal = function(xx,yy,cv=cv4,XX = AA,clustervar="hsanum")
  {

    # Covariate Sets
    rf  = sec =  nn = list()
    zz = paste0(xx,"_jkf.a")

    print(paste0("++++++++++++",xx,"++++++++++++++"))

    XX.Orig = XX
    
    for (y in yy)
      {
        print(paste0("***********",y,"**********"))
        fmla.rf = as.formula(paste0(y," ~ ",xx,"_jkf.a + ", paste(cv,collapse="+")))
        frf = lm(fmla.rf,data=XX)
        
        # Cluster Standard Errors
        library(sandwich)
        library(lmtest)
        source("./sub-files/cluster-se.r")
        
        XX2 <- XX %>% data.frame()
        sec[[y]] <- coeftest.cluster(XX2,frf, cluster1=clustervar)

        if (!grepl("_orig",xx)) {
          mf = as.data.frame(expand.model.frame(frf,as.formula(paste0("~",xx,"_orig"))))
          bary = mean(model.frame(mf)[,paste0(xx,"_orig")]); bary
          sdy = sd(model.frame(mf)[,paste0(xx,"_orig")]); sdy        
          
        } else {
          mf = as.data.frame(expand.model.frame(frf,as.formula(paste0("~",xx))))
          bary = mean(model.frame(mf)[,paste0(xx)]); bary
          sdy = sd(model.frame(mf)[,paste0(xx)]); sdy        
        }
        rf[[y]] = as.data.frame(summary(frf)$coefficients)
        XX2 = model.frame(frf)

        nn[[y]] = nrow(XX2)
        print(paste0(y,": BAL"))
        print(round(rf[[y]][2,],3))
        print("")
        print("")
        print(paste0("n = ",nrow(XX2)))
        # for clustered standard errors cluster.bs.c <- cluster.bs.ivreg(fm, dat = CigarettesSW, cluster = ~state, report = T)
        XX <- XX.Orig
      }
    out = list(rf = rf , nn =  nn , stats = c(bary,sdy) ,cv = cv, ses = sec)
    return(out)
    
  }

# Outcomes
YY = c("death365","readmit30","death30")

# RESTAT QUALITY MEAUSURES
QQc = c("Hmort30","Hreadm30","satis","process","Hmort30_abs","Hreadm30_abs","satis_abs","process_abs","Hmort30C","Hreadm30C","satisC","processC")
QQprocess.indiv = c("ami2","hf1","hf2","hf3","pn6","inf1","inf3","ami4","pn4","hf4","pn2","pn7","ami1")
QQ = c(QQc,QQprocess.indiv)

# Fit blance regressions (iterates over each instrument for each quality measure)
balcvs <- cv2[-grep("^year",cv2)]
balcvs <- c(balcvs,"amb_pmt") # Include Ambulance payment in measures

for (qq in QQ) {
  if (!(file.exists(paste0("../results/",cohort,"-ver-",version,"/","balance-XXX",qq,".Rdata")))) {
  balance <- list()
  for (yyy in balcvs)
    {
      cv.temp <- cv2[-grep(yyy,cv2)]
      if (grepl("^ag",yyy)) {
        balance[[yyy]] <- fitbal(xx = qq, yy = yyy, cv = cv.temp[-grep("^ag",cv.temp)], clustervar="hsanum")
      }
      if (yyy == "black") {
        balance[[yyy]] <- fitbal(xx = qq, yy = yyy, cv = cv.temp[-grep("race",cv.temp)], clustervar="hsanum")
      }
      if (yyy == "race_other") {
        balance[[yyy]] <- fitbal(xx = qq, yy = yyy, cv = cv.temp[-grep("black",cv.temp)], clustervar="hsanum")
      }
      if (grepl("^dx",yyy)) {
        balance[[yyy]] <- fitbal(xx = qq, yy = yyy, cv = cv.temp[-grep("^dx",cv.temp)], clustervar="hsanum")
      } 
      if (!grepl("^dx|^ag|black|race_other",yyy)) {
        balance[[yyy]] <- fitbal(xx = qq, yy = yyy, cv = cv.temp, clustervar="hsanum")
      }
    }
  assign(paste0("b",qq),balance)
  save(list=paste0("b",qq),file=paste0("../results/",cohort,"-ver-",version,"/","balance-",qq,".Rdata"))
  }
}

# Fit main regressions
for (qq in QQ)
{
  print(qq)
  if (file.exists(paste0("../results/",cohort,"-ver-",version,"/","fit-",qq,".Rdata"))==FALSE)
    {     
      f1  = fitamb(xx = qq, cv = cv1, y = YY)
      f2  = fitamb(xx = qq, cv = cv2, y = YY)
      assign(paste0("f",qq,"1"),f1)
      assign(paste0("f",qq,"2"),f2)
      f = list(get(paste0("f",qq,"1")),
        get(paste0("f",qq,"2")))
      assign(paste0("f",qq),f)
      save(list=paste0("f",qq),file=paste0("../results/",cohort,"-ver-",version,"/","fit-",qq,".Rdata"))
    }
}

# RESTAT ROBUSTNESS: EXCLUDE AMI, PN AND HF PATIENTS
for (qq in QQ)
{
  print(qq)
  if (file.exists(paste0("../results/",cohort,"no-ami-pn-hf-ver-",version,"/","fit-",qq,".Rdata"))==FALSE)
    {     
      f1  = fitamb(xx = qq, cv = cv1, y = YY, XX = AA.nonCMS,XX1y=AA1y.nonCMS)
      f2  = fitamb(xx = qq, cv = cv2, y = YY, XX = AA.nonCMS,XX1y=AA1y.nonCMS)
      assign(paste0("f",qq,"1"),f1)
      assign(paste0("f",qq,"2"),f2)
      f = list(get(paste0("f",qq,"1")),
        get(paste0("f",qq,"2")))
      assign(paste0("f",qq),f)
      save(list=paste0("f",qq),file=paste0("../results/",cohort,"no-ami-pn-hf-ver-",version,"/","fit-",qq,".Rdata"))
    }
}

# Function to fit a horserace regression instrumenting for multiple quality measures.
fithr = function(xx,yy,cv=cv4,XX = AA,XX1y=AA1y)
  {

    # Covariate Sets
    fs = iv = ols = sec = sec.ols = list()
    zz = paste0(xx,"_jkf.a")

    print(paste0("++++++++++++",xx,"++++++++++++++"))
    for (y in yy)
      {
        if (y=="death365")
          {
            XX <- XX1y
            cvyrs <- cv[grep("year",cv)]
            cv2 <- cv[-which(cv==cvyrs[length(cvyrs)])]
            cv <- cv2
          }
        print(paste0("***********",y,"**********"))
        

        fmla.ols = as.formula(paste0(y," ~ ",paste(xx,collapse="+"),"+ ", paste(cv,collapse="+")))
        fmla.iv = as.formula(paste0(y," ~ ",paste(xx,collapse="+"),"+",paste(cv,collapse="+")," | " , paste(paste0(xx,"_jkf.a"),collapse="+"), "+",paste(cv,collapse="+")))
        fmla.rf = as.formula(paste0(y," ~ ",paste(paste0(xx,"_jkf.a"),collapse="+"),"+ ", paste(cv,collapse="+")))
        fiv = ivreg(fmla.iv,data=XX)
        fols = lm(fmla.ols,data=XX)        
        # Cluster Standard Errors
        library(sandwich)
        library(lmtest)

        source("./sub-files/cluster-se.r")
        
        XX2 <- XX %>% data.frame()
        sec[[y]] <- coeftest.cluster(XX2,fiv, cluster1="hsanum")

        sec.ols[[y]] <- coeftest.cluster(XX2,fols, cluster1="hsanum")
        if (!grepl("_orig",xx)) {
          mf = as.data.frame(expand.model.frame(fiv,as.formula(paste0("~",xx,"_orig"))))
        } else {
          mf = as.data.frame(expand.model.frame(fiv,as.formula(paste0("~",xx))))
        }
        
        iv[[y]] = as.data.frame(summary(fiv)$coefficients)
        XX2 = model.frame(fiv)
        fols = lm(fmla.ols,data=XX2)        
        ols[[y]] = as.data.frame(summary(fols)$coefficients)
        print(paste0(y,": OLS"))
        print(round(ols[[y]][2:5,],3))
        print(paste0(y,": 2SLS"))        
        print(round(iv[[y]][2:5,],3))
        print("")
        print("")
        print(paste0("n = ",nrow(XX2)))

      }
    out = list(ols = ols , iv = iv , n = nrow(XX2) ,cv = cv, ses = sec)
    return(out)
    
  }


# Fit the horserace regressions.

  if (file.exists(paste0("../results/",cohort,"-ver-",version,"/","fit-hr",".Rdata"))==FALSE)
    {     
      f1  = fithr(xx = HR, cv = cv1, y = YY)
      f2  = fithr(xx = HR, cv = cv2, y = YY)
      assign(paste0("f","hr","1"),f1)
      assign(paste0("f","hr","2"),f2)
      f = list(get(paste0("f","hr","1")),
        get(paste0("f","hr","2")))
      assign(paste0("f","hr"),f)
      save(list=paste0("f","hr"),file=paste0("../results/",cohort,"-ver-",version,"/","fit-","hr",".Rdata"))
    }


